## This program employs a mann whitney u test to compare two trash runs
## Inputs: staph genes file, both conditions' IGV files
## >> python mannwhitneyu.py gene_non-coding_table.txt ___.igv ____.igv > output.txt

#from __future__ import division
import sys, random, scipy, numpy
from math import *
from scipy import stats


## DEFINE GENE NAMES, STARTS, ENDS

genes = {}; genereads = {}

init = 0
count = 1

#for line in open(sys.argv[1]):
#    split = line.split('\t')
#    if split[0][0:3] == 'SAO':
#        name = split[0]; start = int(split[1]); end = int(split[2])
#        genes[name] = [start, end]
        #print genes
#        genereads[name] = [[],[]]


for line in open(sys.argv[1]):
    split = line.split('\t')
    name = split[0]; start = int(split[1]); end = int(split[2])
    genes[name] = [start, end]
    genereads[name] = [[],[]]
#print genes    

        

        
genelist = genes.keys()
genelist.sort()
#print genelist
#print genereads
## DEFINE RATIO BETWEEN TOTAL READ COUNTS AS CORRECTION

Lib1Reads = 0; Lib2Reads = 0
Lib1TA = 0; Lib2TA = 0

for line in open(sys.argv[2]):
    #print line
    split = line.split()
    #print split[3][0]
    if len(split)>1 and split[3][0].isdigit()==True:
        Lib1TA += 1
        Lib1Reads += float(split[3])
#print Lib1Reads

for line in open(sys.argv[3]):
    split = line.split()
    if len(split)>1 and split[3][0].isdigit()==True:
        Lib2TA += 1
        Lib2Reads += float(split[3])

ratio = float(Lib1Reads)/float(Lib2Reads)

## SET DICTIONARIES WITH GENE READS

## set first for Library 1
for line in open(sys.argv[2]):
    split = line.split()
    #print split
    if len(split) > 4 and split[3][0].isdigit() == True and split[4] in genereads:
        site = int(split[1]); start = int(genes[split[4]][0]); end = int(genes[split[4]][1])
        length = end - start
        if (start+(0.03*length)) <= site <= (end-(0.03*length)):
            genereads[split[4]][0].append(float(split[3]))
    #else:
    # print split [4]

## set for Library 2
for line in open(sys.argv[3]):
    split = line.split()
    if len(split) > 4 and split[3][0].isdigit() == True and split[4] in genereads:
        site = int(split[1]); start = int(genes[split[4]][0]); end = int(genes[split[4]][1])
        length = end - start
        if (start+(0.03*length)) <= site <= (end-(0.03*length)):
            genereads[split[4]][1].append(float(split[3]))

## MANN-WHITNEY U TEST FOR ALL GENES
#print genereads
#print len(genelist)
for i in range(len(genelist)):
    #print genelist[i]
    SAO = genelist[i]
    start = genes[SAO][0]; end = genes[SAO][1]
    #print end
    Lib1Counts = genereads[SAO][0]
    Lib2Counts = genereads[SAO][1]
    TA = len(Lib1Counts)
    readslib1 = sum(Lib1Counts)
    readslib2 = sum(Lib2Counts)
    length = end - start
    if length > 0:
        index1 = readslib1/length
        index2 = readslib2/length
    if TA == 0:
        print "%s\t%d\t%s" % (SAO,TA,'Gene Has No TAs')
    else:
        if sum(Lib1Counts) == 0 and sum(Lib2Counts) ==0:
            print "%s\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%r\t%r" % (SAO,TA,'0','0','0','0','0',length,'0','0',start,end)
        else:
            try:
                U, p_val = scipy.stats.mannwhitneyu(Lib1Counts,Lib2Counts)
                CountRatio = float(sum(Lib2Counts)+1)/float(sum(Lib1Counts)+1)
                #print "%s\t%d\t%d\t%0.5f\t%0.3f" % (SAO,TA,U,p_val,CountRatio)
                print "%s\t%d\t%d\t%d\t%d\t%0.100f\t%0.10f\t%d\t%0.10f\t%0.10f\t%r\t%r" % (SAO,TA,readslib1,readslib2,U,p_val,CountRatio,length,index1,index2,start,end)
            except (ValueError):
                pass
      


        

